############################## Config #########################################
# Workflow-wide default container image. Rules that declare their own
# `container:` (export_zarr does) use that image instead.
# NOTE(review): the `latest` tag is not pinned, so results may change when the
# image is updated upstream — consider pinning a specific tag or digest.
container: "docker://satijalab/azimuth-references:latest"

# Organ names used to build the per-organ count-matrix paths
# (data/<organ>_gene_count.RDS) that make_BBI_query takes as input.
ORGANS = [
    "Adrenal",
    "Cerebellum",
    "Cerebrum",
    "Intestine",
    "Heart",
    "Liver",
    "Lung",
    "Muscle",
    "Pancreas",
    "Placenta",
    "Eye",
    "Spleen",
    "Stomach",
    "Thymus",
]

############################## All  ###########################################
# Default target rule: requests every final artifact produced by the workflow.
rule all:
    input:
        # Outputs of make_BBI_reference
        "reference/ref.Rds",
        "reference/idx.annoy",
        "seurat_objects/fullref.Rds",
        # Demo query objects (make_BBI_query, setup_asp, setup_lind, setup_enge)
        "reference/bbi_query.rds",
        "reference/asp.rds",
        "reference/lind.rds",
        "reference/enge.rds",
        # Outputs of export_zarr
        "vitessce/vitessce_ref.h5Seurat",
        "vitessce/vitessce_ref.h5ad",
        # NOTE(review): directory() is documented as an output flag; it appears
        # to be accepted here but is probably unnecessary on an input — confirm.
        directory("vitessce/vitessce_ref.zarr")


############################## BBI Downloads  ###########################################

# Fetches the BBI cell and gene metadata tables (df_cell.RDS, df_gene.RDS)
# into data/ and stamps the download time in logs/download_bbi_metadata.log.
rule download_BBI_metadata:
    output:
        cell_metadata = "data/df_cell.RDS",
        gene_metadata = "data/df_gene.RDS"
    params:
        # Source URLs; each key matches the output it is saved to.
        cell_metadata = "https://atlas.fredhutch.org/data/bbi/descartes/human_gtex/downloads/data_summarize_fetus_data/df_cell.RDS",
        gene_metadata = "https://atlas.fredhutch.org/data/bbi/descartes/human_gtex/downloads/data_summarize_fetus_data/df_gene.RDS"
    shell:
        """
        mkdir -p logs
        wget {params.cell_metadata} -O {output.cell_metadata}
        wget {params.gene_metadata} -O {output.gene_metadata}
        echo "BBI metadata downloaded on: $(date)" > logs/download_bbi_metadata.log
        """

# Fetches the subsampled BBI gene-count matrix into data/ and stamps the
# download time in logs/download_bbi_subsampled.log.
rule download_BBI_subsampled:
    output:
        counts = "data/gene_count_sampled.RDS"
    params:
        # Source URL for the file saved as output.counts.
        counts = "https://atlas.fredhutch.org/data/bbi/descartes/human_gtex/downloads/data_summarize_fetus_data/gene_count_sampled.RDS"
    shell:
        """
        mkdir -p logs
        wget {params.counts} -O {output.counts}
        echo "BBI subsampled data downloaded on: $(date)" > logs/download_bbi_subsampled.log
        """

# Fetches one per-organ BBI gene-count matrix. The {sample} wildcard is taken
# from the requested output path and substituted into the download URL and
# into the name of the per-sample log file.
rule download_BBI_individual_organs:
    output:
        organ = "data/{sample}_gene_count.RDS"
    params:
        organ = "https://atlas.fredhutch.org/data/bbi/descartes/human_gtex/downloads/data_summarize_fetus_data/{sample}_gene_count.RDS"
    shell:
        """
        mkdir -p logs
        wget {params.organ} -O {output.organ}
        echo "BBI data downloaded on: $(date)" > logs/download_{wildcards.sample}_data.log
        """

############################## Reference  ###########################################

# Runs scripts/preprocess_BBI_reference.R on the subsampled BBI counts and the
# cell/gene metadata tables to produce the intermediate reference object.
# Positional script arguments: counts, cell metadata, gene metadata, output.
# stdout and stderr of the script are captured in logs/preprocess_bbi.Rout;
# logs/ is created first so the redirect cannot fail when the inputs already
# exist and no download rule (which would create logs/) has run.
rule preprocess_BBI_reference:
    input:
        script = "scripts/preprocess_BBI_reference.R",
        counts = "data/gene_count_sampled.RDS",
        cell_metadata = "data/df_cell.RDS",
        gene_metadata = "data/df_gene.RDS"
    output:
        "seurat_objects/ref_intermediate.Rds"
    shell:
        """
        mkdir -p logs
        Rscript {input.script} {input.counts} {input.cell_metadata} {input.gene_metadata} {output} > logs/preprocess_bbi.Rout 2>&1
        """

# Runs scripts/make_BBI_reference.R on the intermediate reference object and
# the cell/gene metadata tables, writing the reference, its index file and the
# full reference object.
# Positional script arguments: intermediate object, cell metadata, gene
# metadata, then the three output paths (ref, idx, fullref).
# stdout and stderr are captured in logs/make_bbi_reference.Rout; logs/ is
# created first so the redirect cannot fail on a missing directory.
rule make_BBI_reference:
    input:
        script = "scripts/make_BBI_reference.R",
        processed_ref = "seurat_objects/ref_intermediate.Rds",
        cell_metadata = "data/df_cell.RDS",
        gene_metadata = "data/df_gene.RDS"
    output:
        ref = "reference/ref.Rds",
        idx = "reference/idx.annoy",
        fullref = "seurat_objects/fullref.Rds"
    shell:
        """
        mkdir -p logs
        Rscript {input.script} {input.processed_ref} {input.cell_metadata} {input.gene_metadata} {output.ref} {output.idx} {output.fullref} > logs/make_bbi_reference.Rout 2>&1
        """


############################## Demos  ###########################################

# Runs scripts/setup_BBI_query.R to build the BBI demo query object from the
# reference, the per-organ count matrices and the cell/gene metadata tables.
# {input.samples} expands to one path per entry of ORGANS, separated by
# spaces, so the script receives: ref, <one path per organ>, cell metadata,
# gene metadata, output.
# NOTE(review): the script must therefore cope with a variable number of
# sample arguments — confirm against scripts/setup_BBI_query.R.
# stdout and stderr are captured in logs/make_bbi_query.Rout; logs/ is created
# first so the redirect cannot fail on a missing directory.
rule make_BBI_query:
    input:
        script = "scripts/setup_BBI_query.R",
        ref = "reference/ref.Rds",
        samples = expand("data/{sample}_gene_count.RDS", sample = ORGANS),
        cell_metadata = "data/df_cell.RDS",
        gene_metadata = "data/df_gene.RDS"
    output:
        "reference/bbi_query.rds"
    shell:
        """
        mkdir -p logs
        Rscript {input.script} {input.ref} {input.samples} {input.cell_metadata} {input.gene_metadata} {output} > logs/make_bbi_query.Rout 2>&1
        """

# Runs scripts/setup_enge.R on the extracted Enge download to build the Enge
# demo query object. The tarball is listed as an input only to order this rule
# after the download; the script itself is given the directory (params.path)
# and the output path.
# stdout and stderr are captured in logs/setup_enge.Rout; logs/ is created
# first so the redirect cannot fail on a missing directory.
rule setup_enge:
    input:
        script = "scripts/setup_enge.R",
        tar = "data/enge_data/enge.tar"
    params:
        path = "data/enge_data/"
    output:
        "reference/enge.rds"
    shell:
        """
        mkdir -p logs
        Rscript {input.script} {params.path} {output} > logs/setup_enge.Rout 2>&1
        """
        
# Runs scripts/setup_asp.R on the filtered Asp count matrix and metadata table
# to build the Asp demo query object.
# Positional script arguments: counts, metadata, output.
# stdout and stderr are captured in logs/setup_asp.Rout; logs/ is created
# first so the redirect cannot fail on a missing directory.
rule setup_asp:
    input:
        script = "scripts/setup_asp.R",
        counts = "data/asp_data/counts_filtered.tsv",
        metadata = "data/asp_data/md_filtered.tsv"
    output:
        "reference/asp.rds"
    shell:
        """
        mkdir -p logs
        Rscript {input.script} {input.counts} {input.metadata} {output} > logs/setup_asp.Rout 2>&1
        """

# Runs scripts/setup_lind.R on the Lind count table to build the Lind demo
# query object. Positional script arguments: counts table, output.
# The script's stdout and stderr go to logs/setup_lind.Rout.
rule setup_lind:
    output:
        "reference/lind.rds"
    input:
        data = "data/lind_data/counts.tsv",
        script = "scripts/setup_lind.R"
    shell:
        """
        mkdir -p logs
        Rscript {input.script} {input.data} {output} > logs/setup_lind.Rout 2>&1
        """

# Downloads the GEO archive for accession GSE81547 into data/enge_data/ as
# enge.tar, unpacks it in place, and stamps the download time in
# logs/download_enge.log.
# logs/ is created before changing directory: the final echo writes to
# ../../logs/, and without the directory the job would fail only after the
# download and extraction had already completed.
rule download_enge:
    output:
        directory("data/enge_data/"),
        "data/enge_data/enge.tar"
    shell:
        """
        mkdir -p logs
        mkdir -p data/enge_data
        cd data/enge_data
        wget 'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE81547&format=file' -O enge.tar
        tar -xvf enge.tar
        echo "Enge data downloaded on: $(date)" > ../../logs/download_enge.log
        """

# Downloads the Asp dataset archive, unpacks the nested "Filtered" archive,
# decompresses its share_files/ contents, and moves the filtered count matrix
# and metadata table to the declared output names. The download time is
# stamped in logs/download_asp.log.
#
# The recipe is written to be safely re-runnable after a partial failure:
#   * logs/ is created up front, since the final echo writes to ../../logs/;
#   * wget uses -O so a retry overwrites the archive instead of saving a
#     numbered copy (.zip.1) while unzip keeps reading the stale file;
#   * extraction directories left by an earlier attempt are removed, so gunzip
#     never meets already-decompressed files;
#   * unzip uses -o so it overwrites without prompting in a non-interactive job.
rule download_asp:
    output:
        "data/asp_data/counts_filtered.tsv",
        "data/asp_data/md_filtered.tsv"
    shell:
        """
        mkdir -p logs
        mkdir -p data/asp_data
        cd data/asp_data
        wget 'https://md-datasets-cache-zipfiles-prod.s3.eu-west-1.amazonaws.com/mbvhhf8m62-2.zip' -O mbvhhf8m62-2.zip
        rm -rf Filtered share_files
        unzip -o mbvhhf8m62-2.zip
        unzip -o Filtered/'Developmental_heart_filtered_scRNA-seq_and_meta_data.zip'
        gunzip share_files/*
        mv share_files/all_cells_count_matrix_filtered.tsv counts_filtered.tsv
        mv share_files/all_cells_meta_data_filtered.tsv md_filtered.tsv
        echo "ASP data downloaded on: $(date)" > ../../logs/download_asp.log
        """

# Downloads the GEO archive for accession GSE102596 into data/lind_data/,
# unpacks it, decompresses the GSM2741551 count table and renames it to
# counts.tsv. The download time is stamped in logs/download_lind.log.
# logs/ is created before the `cd` because the final echo writes to ../../logs/.
# NOTE(review): one line of the shell block is indented with a tab rather than
# spaces; this is harmless to bash but inconsistent with the rest of the file.
rule download_lind:
    output:
        "data/lind_data/counts.tsv"
    shell:
        """
        mkdir -p logs
	mkdir -p data/lind_data
        cd data/lind_data
        wget 'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE102596&format=file' -O lind.tar
        tar -xvf lind.tar
        gunzip GSM2741551_count-table-human16w.tsv.gz
        mv GSM2741551_count-table-human16w.tsv counts.tsv
        echo "Lind data downloaded on: $(date)" > ../../logs/download_lind.log
        """

############################## Zarr  ###########################################

# Exports the reference for Vitessce in two steps, inside a rule-specific
# container image (the `vitessce` tag):
#   1. scripts/convert_to_h5ad.R receives ref, fullref and the h5Seurat
#      output path; its stdout and stderr go to logs/export_zarr_anndata.Rout.
#   2. scripts/convert_to_zarr.py receives the h5ad path and the zarr
#      directory path.
# NOTE(review): {output.h5ad} is never passed to the R script, so the script
# presumably derives the .h5ad path from the .h5Seurat path — confirm.
# logs/ is created alongside vitessce/ so the redirect in step 1 cannot fail
# on a missing directory.
rule export_zarr:
    input:
        ref = "reference/ref.Rds",
        fullref = "seurat_objects/fullref.Rds",
        script1 = "scripts/convert_to_h5ad.R",
        script2 = "scripts/convert_to_zarr.py"
    output:
        h5Seurat = "vitessce/vitessce_ref.h5Seurat",
        h5ad = "vitessce/vitessce_ref.h5ad",
        zarr = directory("vitessce/vitessce_ref.zarr")
    container:
        "docker://satijalab/azimuth-references:vitessce"
    shell:
        """
        mkdir -p vitessce
        mkdir -p logs
        Rscript {input.script1} {input.ref} {input.fullref} {output.h5Seurat} > logs/export_zarr_anndata.Rout 2>&1
        python3 {input.script2} {output.h5ad} {output.zarr}
        """


